library("tidyverse")
library("data.table")

big_mac_countries <- c(
    "ARG", "AUS", "BRA", "GBR", "CAN", "CHL", "CHN", "CZE", "DNK",
    "EGY", "HKG", "HUN", "IDN", "ISR", "JPN", "MYS", "MEX", "NZL",
    "NOR", "PER", "PHL", "POL", "RUS", "SAU", "SGP", "ZAF", "KOR",
    "SWE", "CHE", "TWN", "THA", "TUR", "ARE", "USA", "COL", "CRI",
    "PAK", "LKA", "UKR", "URY", "IND", "VNM", "GTM", "HND", "VEN",
    "NIC", "AZE", "BHR", "JOR", "KWT", "LBN", "MDA", "OMN",
    "QAT", "ROU", "EUZ"
)

big_mac_data <- fread("./source-data/big-mac-source-data-v2.csv", na.strings = "#N/A") %>%
    .[!is.na(local_price)] %>% # remove lines where the local price is missing
    .[, GDP_local := as.numeric(GDP_local)] %>% # convert GDP to a number
    .[order(date, name)] # sort by date and then by country name, for easy reading

latest_date <- big_mac_data$date %>% max()

big_mac_data[, dollar_price := local_price / dollar_ex]

base_currencies <- c("USD", "EUR", "GBP", "JPY", "CNY")

big_mac_index <- big_mac_data[
    !is.na(dollar_price) & iso_a3 %in% big_mac_countries,
    .(date, iso_a3, currency_code, name, local_price, dollar_ex, dollar_price)
]

for (currency in base_currencies) {
    big_mac_index[
        , # we don't want a subset, so our first argument is blank
        (currency) := # we'll add a new column named for the base set
            dollar_price / # we divide the dollar price in each row by
            # the dollar price on the *base currency*'s row (.SD is a data.table
            .SD[currency_code == currency]$dollar_price - # that contains only the current group)
            1, # one means parity (neither over- nor under-valued), so we subtract one
        # to get an over/under-valuation value
        by = date # and of course, we'll group these rows by date
    ]
}
big_mac_index[, (base_currencies) := round(.SD, 5L), .SDcols = base_currencies]

to_plot <- big_mac_index[date == latest_date]
to_plot$name <- factor(to_plot$name, levels = to_plot$name[order(to_plot$USD)])

fwrite(big_mac_index, "./output-data/big-mac-raw-index.csv")

big_mac_gdp_data <- big_mac_data[GDP_local > 0]

regression_countries <- c(
    "ARG", "AUS", "BRA", "GBR", "CAN", "CHL", "CHN", "CZE", "DNK",
    "EGY", "EUZ", "HKG", "HUN", "IDN", "ISR", "JPN", "MYS", "MEX",
    "NZL", "NOR", "PER", "PHL", "POL", "RUS", "SAU", "SGP", "ZAF",
    "KOR", "SWE", "CHE", "TWN", "THA", "TUR", "USA", "COL", "PAK",
    "IND", "AUT", "BEL", "NLD", "FIN", "FRA", "DEU", "IRL", "ITA",
    "PRT", "ESP", "GRC", "EST"
)
# in 2021, we added a number of additional countries to the adjusted index
regression_addons_2021 <- c(
    "ARE", "CRI", "LKA", "UKR", "URY", "VNM", "GTM", "HND", "NIC",
    "AZE", "BHR", "HRV", "JOR", "KWT", "MDA", "OMN", "QAT", "ROU",
    "SVK", "SVN", "LVA", "LTU"
)
big_mac_gdp_data <- big_mac_gdp_data[iso_a3 %in% regression_countries |
    (iso_a3 %in% regression_addons_2021 & date >= as.Date("2021-01-01"))]

big_mac_gdp_data %>%
    .[, GDP_bigmac := GDP_local / (local_price / .SD[iso_a3 == "USA"]$local_price), by = date]

big_mac_gdp_data[
    ,
    `:=`(
        adj_price = lm(dollar_price ~ GDP_bigmac) %>% predict()
        #         adj_price_USD=lm(dollar_price ~ GDP_dollar) %>% predict
    ),
    by = date
]

big_mac_adj_index <- big_mac_gdp_data[
    !is.na(dollar_price) &
        (
            iso_a3 %in% regression_countries |
                iso_a3 %in% regression_addons_2021 & date >= "2021-01-01"
        ) &
        iso_a3 %in% big_mac_countries,
    .(date, iso_a3, currency_code, name, local_price, dollar_ex, dollar_price, GDP_bigmac, adj_price)
]

for (currency in base_currencies) {
    big_mac_adj_index[
        , # we don't want a subset, so our first argument is blank
        (currency) := # we'll add a new column named for the base set
            ( # we divide the dollar price by the adjusted price to get
                dollar_price / adj_price # the deviation from our expectation by
            ) /
            # the same figure from the *base currency*'s rowa\
            (
                .SD[currency_code == currency]$dollar_price /
                    .SD[currency_code == currency]$adj_price
            ) -
            1, # one means parity (neither over- nor under-valued), so we subtract one
        # to get an over/under-valuation value
        by = date # and of course, we'll group these rows by date
    ]
}
big_mac_adj_index[, (base_currencies) := round(.SD, 5L), .SDcols = base_currencies]

fwrite(big_mac_adj_index, "./output-data/big-mac-adjusted-index.csv")

big_mac_full_index <- merge(big_mac_index, big_mac_adj_index,
    by = c("date", "iso_a3", "currency_code", "name", "local_price", "dollar_ex", "dollar_price"),
    suffixes = c("_raw", "_adjusted"),
    all.x = TRUE
)

fwrite(big_mac_full_index, "./output-data/big-mac-full-index.csv")
